insurance<- read.csv(file.choose(), header = TRUE,sep=";",na.strings = c("", " "))
View(insurance)
attach(insurance)
#### convert chr columns to factors:
columns_to_convert <- c("sex","smoker", "region")
insurance[columns_to_convert] <- lapply(insurance[columns_to_convert], factor)
summary(insurance)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
names(insurance)
## [1] "age" "sex" "bmi" "children" "smoker" "region" "charges"
dim(insurance)
## [1] 1338 7
head(insurance)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
tail(insurance)
## age sex bmi children smoker region charges
## 1333 52 female 44.70 3 no southwest 11411.685
## 1334 50 male 30.97 3 no northwest 10600.548
## 1335 18 female 31.92 0 no northeast 2205.981
## 1336 18 female 36.85 0 no southeast 1629.833
## 1337 21 female 25.80 0 no southwest 2007.945
## 1338 61 female 29.07 0 yes northwest 29141.360
na_tot<- sum(is.na(insurance))
na_tot_col<- colSums(is.na(insurance))
print(na_tot)
## [1] 0
print(na_tot_col)
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
quantitative_columns <- c("age", "bmi", "children", "charges")
calculate_z_scores <- function(x) {
(x - mean(x)) / sd(x)
}
z_scores <- lapply(insurance[quantitative_columns], calculate_z_scores)
# Identifying outliers (threshold: ±3)
outliers <- apply(abs(do.call(cbind, z_scores)), 1, max) > 3
# Displaying rows with outliers
outliers_data <- insurance[outliers, ]
print(outliers_data)
## age sex bmi children smoker region charges
## 33 19 female 28.600 5 no southwest 4687.797
## 35 28 male 36.400 1 yes southwest 51194.559
## 72 31 male 28.500 5 no northeast 6799.458
## 117 58 male 49.060 0 no southeast 11381.325
## 167 20 female 37.000 5 no southwest 4830.630
## 414 25 male 23.900 5 no southwest 5080.096
## 426 45 male 24.310 5 no southeast 9788.866
## 439 52 female 46.750 5 no southeast 12592.534
## 544 54 female 47.410 0 yes southeast 63770.428
## 569 49 female 31.900 5 no southwest 11552.904
## 578 31 female 38.095 1 yes northeast 58571.074
## 641 33 male 42.400 5 no southwest 6666.243
## 820 33 female 35.530 0 yes northwest 55135.402
## 848 23 male 50.380 1 no southeast 2438.055
## 878 33 male 33.440 5 no southeast 6653.789
## 933 46 male 25.800 5 no southwest 10096.970
## 938 39 female 24.225 5 no northwest 8965.796
## 970 39 female 34.320 5 no southeast 8596.828
## 985 20 male 30.115 5 no northeast 4915.060
## 1048 22 male 52.580 1 yes southeast 44501.398
## 1086 39 female 18.300 5 yes southwest 19023.260
## 1117 41 male 29.640 5 no northeast 9222.403
## 1131 39 female 23.870 5 no southeast 8582.302
## 1147 60 male 32.800 0 yes southwest 52590.829
## 1231 52 male 34.485 3 yes northwest 60021.399
## 1246 28 male 24.300 5 no southwest 5615.369
## 1273 43 male 25.520 5 no southeast 14478.330
## 1301 45 male 30.360 0 yes southeast 62592.873
## 1318 18 male 53.130 0 no southeast 1163.463
par(mfrow = c(2, 2))
quantitative_vars <- sapply(insurance, is.numeric)
numeric_data <- insurance[, quantitative_vars]
#boxplot pour les variables quantitatives
library(plotly)
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
boxplots <- lapply(colnames(numeric_data), function(variable) {
plot_ly(data = numeric_data, y = ~get(variable), type = "box", name = variable) %>%
layout(title = paste("Boxplot of", variable))
})
subplot(boxplots)
par(mfrow = c(1, 1))
summary(insurance[c("age", "bmi", "children", "charges")])
## age bmi children charges
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.30 1st Qu.:0.000 1st Qu.: 4740
## Median :39.00 Median :30.40 Median :1.000 Median : 9382
## Mean :39.21 Mean :30.66 Mean :1.095 Mean :13270
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000 3rd Qu.:16640
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
par(mfrow = c(2, 2))
ggplot(insurance, aes(x = sex, fill = sex)) +
geom_bar() +
labs(title = "Bar Plot of Sex", x = "Sex", y = "Count") +
theme_minimal()
ggplot(insurance, aes(x = region, fill = sex)) +
geom_bar() +
labs(title = "Bar Plot of Region", x = "Region", y = "Count") +
theme_minimal()
ggplot(insurance, aes(x = smoker, fill = smoker)) +
geom_bar() +
labs(title = "Bar Plot of Smoker", x = "smoker", y = "Count") +
theme_minimal()
par(mfrow = c(1, 1))
any(duplicated(insurance))
## [1] TRUE
sum(duplicated(insurance))
## [1] 1
insurance[duplicated(insurance) | duplicated(insurance, fromLast = TRUE), ]
## age sex bmi children smoker region charges
## 196 19 male 30.59 0 no northwest 1639.563
## 582 19 male 30.59 0 no northwest 1639.563
insurance<-unique(insurance)
any(duplicated(insurance))
## [1] FALSE
summary(age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 27.00 39.00 39.21 51.00 64.00
boxplot(age, main = "Boxplot de la variable Age", ylab = "Age")
hist(age, main = "Histogram de la variable Age", xlab = "Age")
ggplot(data = insurance, aes(x = age)) +
geom_density(fill = "blue", alpha = 0.5) +
stat_function(fun = dnorm, args = list(mean = mean(insurance$age), sd = sd(insurance$age)), color = "red", size = 1) +
labs(title = "Density Plot with Normal Distribution",
x = "Age",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
qqnorm(age)
qqline(age, col = 2)
shapiro.test(age)
##
## Shapiro-Wilk normality test
##
## data: age
## W = 0.9447, p-value < 2.2e-16
Higher BMI is correlated to higher body fat and thus, also correlated to metabolic diseases. BMI is thus a measure of body weight status. For adults, BMI below 18.5 is considered underweight, BMI of 18.5 – 24.9 is considered normal, BMI of 25.0 – 29.9 is considered overweight while BMI above 30 is considered obese.
summary(bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.96 26.30 30.40 30.66 34.69 53.13
boxplot(bmi, main = "Boxplot de la variable bmi", ylab = "bmi")
hist(age, main = "Histogram de la variable bmi", xlab = "bmi")
ggplot(data = insurance, aes(x = bmi)) +
geom_density(fill = "blue", alpha = 0.5) +
stat_function(fun = dnorm, args = list(mean = mean(insurance$bmi), sd = sd(insurance$bmi)), color = "red", size = 1) +
labs(title = "Density Plot with Normal Distribution",
x = "BMI",
y = "Density") +
theme_minimal()
qqnorm(bmi)
qqline(bmi, col = 2)
shapiro.test(bmi)
##
## Shapiro-Wilk normality test
##
## data: bmi
## W = 0.99389, p-value = 2.605e-05
summary(children)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.095 2.000 5.000
boxplot(children, main = "Boxplot de la variable children", ylab = "children")
hist(children, main = "Histogram de la variable children", xlab = "children")
ggplot(data = insurance, aes(x = children)) +
geom_density(fill = "blue", alpha = 0.5) +
stat_function(fun = dnorm, args = list(mean = mean(insurance$children), sd = sd(insurance$children)), color = "red", size = 1) +
labs(title = "Density Plot with Normal Distribution",
x = "Children",
y = "Density") +
theme_minimal()
qqnorm(children)
qqline(children, col = 2)
shapiro.test(children)
##
## Shapiro-Wilk normality test
##
## data: children
## W = 0.82318, p-value < 2.2e-16
summary(charges)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1122 4740 9382 13270 16640 63770
boxplot(charges, main = "Boxplot de la variable Charges", ylab = "Charges")
hist(charges, main = "Histogram de la variable Charges", xlab = "Charges")
ggplot(data = insurance, aes(x = charges)) +
geom_density(fill = "blue", alpha = 0.5) +
stat_function(fun = dnorm, args = list(mean = mean(insurance$charges), sd = sd(insurance$charges)), color = "red", size = 1) +
labs(title = "Density Plot with Normal Distribution",
x = "charges",
y = "Density") +
theme_minimal()
qqnorm(charges)
qqline(charges, col = 2)
shapiro.test(charges)
##
## Shapiro-Wilk normality test
##
## data: charges
## W = 0.81469, p-value < 2.2e-16
summary(insurance$sex)
## female male
## 662 675
ggplot(insurance, aes(x = sex, fill = sex)) +
geom_bar() +
labs(title = "Bar Plot of Sex", x = "Sex", y = "Count") +
theme_minimal()
df <- insurance %>%
group_by(sex) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
ggplot(df, aes(x = "", y = "", fill = sex)) +
geom_col() +
geom_label(aes(label = labels),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y")
summary(insurance$smoker)
## no yes
## 1063 274
ggplot(insurance, aes(x = smoker, fill = smoker)) +
geom_bar() +
labs(title = "Bar Plot of Smoker", x = "Sex", y = "Count") +
theme_minimal()
df2<- insurance %>%
group_by(smoker) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc1 = `n` / sum(`n`)) %>%
arrange(perc1) %>%
mutate(labels = scales::percent(perc1))
ggplot(df2, aes(x = "", y = perc1, fill = smoker)) +
geom_col(color = "black") +
geom_label(aes(label = labels), color = c("white", "white"),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
guides(fill = guide_legend(title = "Smoker")) +
scale_fill_viridis_d() +
coord_polar(theta = "y") +
theme_void()
summary(insurance$region)
## northeast northwest southeast southwest
## 324 324 364 325
ggplot(insurance, aes(x = region, fill = region)) +
geom_bar() +
labs(title = "Bar Plot of region", x = "Region", y = "Count") +
theme_minimal()
df3 <- insurance %>%
group_by(region) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc3 = `n` / sum(`n`)) %>%
arrange(perc3) %>%
mutate(labels = scales::percent(perc3))
ggplot(df3, aes(x = "", y = "", fill = region)) +
geom_col() +
geom_label(aes(label = labels),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y")
Commençons par une inspection graphique:
ggplot(insurance, aes(x = age, y = bmi)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot Age vs BMI",
x = "Age",
y = "BMI") +
theme_minimal() +
xlim(c(min(insurance$age) - 5, max(insurance$age) + 5)) +
ylim(c(min(insurance$bmi) - 5, max(insurance$bmi) + 5))
**La dépendance entre l'âge et le BMI est faible ou nulle.**
cor(age,bmi,method = "spearman")
## [1] 0.107736
cor.test(age, bmi, method ="spearman")
## Warning in cor.test.default(age, bmi, method = "spearman"): Impossible de
## calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: age and bmi
## S = 356213358, p-value = 7.859e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.107736
Le test de signficativité assure l’existence d’une faible liaison monotone statistiquement signficative (de l’ordre de 10%).
ggplot(insurance, aes(x =children, y = age)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot Age vs Children",
x = "Age",
y = "Children") +
theme_minimal()
**Les variables âge et Children ne sont pas corrélées**
cor(age,children,method = "spearman")
## [1] 0.05699222
cor.test(age, children, method ="spearman")
## Warning in cor.test.default(age, children, method = "spearman"): Impossible de
## calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: age and children
## S = 376471515, p-value = 0.03712
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.05699222
la valeur du coefficient de corrélation entre les variables étudiées est presque nulle.
Le test de signficativité assure l’existence d’une dépendance statistiquement signficative mais très faible entre les variables.
**Les variables âge et Children ne sont pas corrélées**ggplot(insurance, aes(x = age, y = charges)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot Age vs Charges",
x = "Age",
y = "Charges") +
theme_minimal()
La forme de la liaison: On peut détécter l’existence d’une liaison moyennement monotonne entre les deux variables.
Le sens de la liaison:L’âge et la charge évoluent dans le même sens, c’est une liaison positive( plus l’âge augmente plus la charge augmente).
L’intensité de la liaison: Les points sont moyennement concentrés.
**C'est une liaison monotone positive moyennement forte**cor(age,charges,method = "spearman")
## [1] 0.5343921
cor.test(age, charges, method ="spearman")
## Warning in cor.test.default(age, charges, method = "spearman"): Impossible de
## calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: age and charges
## S = 185881923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5343921
ggplot(insurance, aes(x = age, y = charges, color = smoker)) +
geom_point() +
facet_grid(sex ~ ., scales = "free") +
labs(title = "Relationship between Age, Charges, and Smoking Status",
x = "Age", y = "Charges", color = "Smoker") +
theme_minimal()
ggplot(insurance, aes(x =children, y = bmi)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot BMI vs Children",
x = "BMI",
y = "Children") +
theme_minimal()
**Les variables BMI et Children ne sont pas corrélées**
cor(bmi,children,method = "spearman")
## [1] 0.01560674
cor.test(age, children, method ="spearman")
## Warning in cor.test.default(age, children, method = "spearman"): Impossible de
## calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: age and children
## S = 376471515, p-value = 0.03712
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.05699222
la valeur du coefficient de corrélation entre les variables étudiées est presque nulle.
Le test de signficativité assure l’existence d’une dépendance statistiquement signficative entre les variables étudiées.
**Les variables BMI et Children ne sont pas corrélées**ggplot(insurance, aes(x=bmi, y = charges)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot BMI vs Charges",
x = "BMI",
y = "Charges") +
theme_minimal()
**C'est une liaison positive, faiblement monotone et moyennement intense**
cor(bmi,charges,method = "spearman")
## [1] 0.1193959
cor.test(bmi, charges, method ="spearman")
## Warning in cor.test.default(bmi, charges, method = "spearman"): Impossible de
## calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: bmi and charges
## S = 351558456, p-value = 1.193e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1193959
la valeur du coefficient de corrélation entre les variables étudiées est faible.
Le test de signficativité assure l’existence d’une dépendence statistiquement signficative (on rejette H0) qui demeure faible entre les variables.
**Les variables BMI et Charges sont faiblement corrélées**ggplot(insurance, aes(x = bmi, y = charges, color = smoker)) +
geom_point() +
labs(title = "Relationship between BMI, Charges, and Smoking Status",
x = "BMI", y = "Charges", color = "Smoker") +
theme_minimal()
ggplot(insurance, aes(x =children, y = charges)) +
geom_point(alpha = 0.7, color = "red", size = 3) +
labs(title = "Scatter Plot Children vs charges",
x = "children",
y = "Carges") +
theme_minimal()
cor(children,charges,method = "spearman")
## [1] 0.1333389
cor.test(charges, children, method ="spearman")
## Warning in cor.test.default(charges, children, method = "spearman"): Impossible
## de calculer la p-value exacte avec des ex-aequos
##
## Spearman's rank correlation rho
##
## data: charges and children
## S = 345992058, p-value = 9.847e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1333389
la valeur du coefficient de corrélation entre les variables étudiées est faible.
Le test de signficativité assure l’existence d’une dépendance statistiquement signficative on rejette (H0 et on accepte H1) .
**Les variables Children et Charges sont très faiblement corrélées**boxplot(charges~region)
tapply(charges,region,shapiro.test)
## $northeast
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.83534, p-value < 2.2e-16
##
##
## $northwest
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.8128, p-value < 2.2e-16
##
##
## $southeast
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.82423, p-value < 2.2e-16
##
##
## $southwest
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.7843, p-value < 2.2e-16
bartlett.test(charges~region)
##
## Bartlett test of homogeneity of variances
##
## data: charges by region
## Bartlett's K-squared = 25.883, df = 3, p-value = 1.009e-05
kruskal.test(charges~region)
##
## Kruskal-Wallis rank sum test
##
## data: charges by region
## Kruskal-Wallis chi-squared = 4.7342, df = 3, p-value = 0.1923
boxplot(charges~smoker)
Independence: Les observations dans chaque sous groupe sont indépendantes les unes des autres.
Normalité:
tapply(charges,smoker,shapiro.test)
## $no
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.87286, p-value < 2.2e-16
##
##
## $yes
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.93955, p-value = 3.625e-09
bartlett.test(charges~smoker)
##
## Bartlett test of homogeneity of variances
##
## data: charges by smoker
## Bartlett's K-squared = 230.33, df = 1, p-value < 2.2e-16
wilcox.test(charges~smoker)
##
## Wilcoxon rank sum test with continuity correction
##
## data: charges by smoker
## W = 7403, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
boxplot(charges~sex)
Independence: Les observations dans chaque sous groupe sont indépendantes les unes des autres.
Normalité:
tapply(charges,sex,shapiro.test)
## $female
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.80539, p-value < 2.2e-16
##
##
## $male
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.82281, p-value < 2.2e-16
bartlett.test(charges~sex)
##
## Bartlett test of homogeneity of variances
##
## data: charges by sex
## Bartlett's K-squared = 15.585, df = 1, p-value = 7.887e-05
wilcox.test(charges~sex)
##
## Wilcoxon rank sum test with continuity correction
##
## data: charges by sex
## W = 221304, p-value = 0.7287
## alternative hypothesis: true location shift is not equal to 0
insurance$sex_smoker <- interaction(insurance$sex, insurance$smoker, sep = "")
attach(insurance)
## Les objets suivants sont masqués depuis insurance (pos = 4):
##
## age, bmi, charges, children, region, sex, smoker
View(insurance)
boxplot(charges~sex_smoker)
tapply(charges,sex_smoker,shapiro.test)
## $femaleno
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.86623, p-value < 2.2e-16
##
##
## $maleno
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.87393, p-value < 2.2e-16
##
##
## $femaleyes
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.93114, p-value = 1.686e-05
##
##
## $maleyes
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.93811, p-value = 2.08e-06
bartlett.test(charges~sex_smoker)
##
## Bartlett test of homogeneity of variances
##
## data: charges by sex_smoker
## Bartlett's K-squared = 228.72, df = 3, p-value < 2.2e-16
kruskal.test(charges~sex_smoker)
##
## Kruskal-Wallis rank sum test
##
## data: charges by sex_smoker
## Kruskal-Wallis chi-squared = 592.14, df = 3, p-value < 2.2e-16
L’indépendance des observations.
L’échantillon étudié a une taille suffisamment grande (n>30) de plus chaque cellule de du tableau de contingence doit avoir un effectif attendu suffisant (>5).
*H0 : Les variables sont indépendantes.
*H1 : Il existe une liaison entre les deux variables.
tab_contingence<- table(insurance$sex, insurance$smoker)
print(tab_contingence)
##
## no yes
## female 547 115
## male 516 159
chi_test <- chisq.test(tab_contingence)
print(chi_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab_contingence
## X-squared = 7.4691, df = 1, p-value = 0.006277
mosaicplot(table(sex, smoker), main = "Mosaic Plot of Sex and Smoker")
insurance_modif3<-insurance
insurance_modif3_encoded <- dummy_cols(insurance_modif3, select_columns = "sex", remove_first_dummy = TRUE)
insurance_modif3_encoded <- subset(insurance_modif3_encoded, select = -c(sex))
insurance_modif3_encoded <- dummy_cols(insurance_modif3_encoded, select_columns = "smoker", remove_first_dummy = TRUE)
insurance_modif3_encoded <- subset(insurance_modif3_encoded, select = -c(smoker))
insurance_modif3_encoded <- dummy_cols(insurance_modif3_encoded, select_columns = "region", remove_first_dummy = TRUE)
insurance_modif3_encoded <- subset(insurance_modif3_encoded, select = -c(region))
insurance_encoded<-insurance_modif3_encoded[, -5]
View(insurance_encoded)
modele_1<-lm(charges~.,data=insurance_encoded)
summary(modele_1)
##
## Call:
## lm(formula = charges ~ ., data = insurance_encoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11305.1 -2850.3 -979.9 1395.0 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11936.56 988.23 -12.079 < 2e-16 ***
## age 256.76 11.91 21.555 < 2e-16 ***
## bmi 339.25 28.61 11.857 < 2e-16 ***
## children 474.82 137.90 3.443 0.000593 ***
## sex_male -129.48 333.20 -0.389 0.697630
## smoker_yes 23847.33 413.35 57.693 < 2e-16 ***
## region_northwest -349.23 476.82 -0.732 0.464053
## region_southeast -1035.27 478.87 -2.162 0.030804 *
## region_southwest -960.08 478.11 -2.008 0.044836 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6064 on 1328 degrees of freedom
## Multiple R-squared: 0.7507, Adjusted R-squared: 0.7492
## F-statistic: 500 on 8 and 1328 DF, p-value: < 2.2e-16
modele1<-lm(charges~age+bmi+smoker_yes,data=insurance_encoded)
summary(modele1)
##
## Call:
## lm(formula = charges ~ age + bmi + smoker_yes, data = insurance_encoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12415.2 -2974.4 -981.3 1490.3 28972.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11671.69 938.14 -12.44 <2e-16 ***
## age 259.43 11.95 21.71 <2e-16 ***
## bmi 322.64 27.50 11.73 <2e-16 ***
## smoker_yes 23822.18 413.06 57.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6094 on 1333 degrees of freedom
## Multiple R-squared: 0.7473, Adjusted R-squared: 0.7467
## F-statistic: 1314 on 3 and 1333 DF, p-value: < 2.2e-16
predicted_values <- predict(modele1)
residuals <- insurance_encoded$charges- predicted_values
rmse <- sqrt(mean(residuals^2))
print(paste("RMSE modele1:", rmse))
## [1] "RMSE modele1: 6085.34491805181"
print(paste("AIC du modele1 : ", AIC(modele1)))
## [1] "AIC du modele1 : 27104.5114872106"
plot(modele1)
insurance_encoded$age2 <- (insurance_encoded$age)^2
str(insurance_encoded$age2)
## num [1:1337] 361 324 784 1089 1024 ...
insurance_encoded$bmi30 <- ifelse(insurance_encoded$bmi >= 30, 1, 0)
str(insurance_encoded$bmi30)
## num [1:1337] 0 1 1 0 0 0 1 0 0 0 ...
modele2 <- lm(charges ~ age + age2 + smoker*bmi30, data = insurance_encoded)
summary(modele2)
##
## Call:
## lm(formula = charges ~ age + age2 + smoker * bmi30, data = insurance_encoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19736.1 -1873.9 -1395.5 -383.9 24013.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.374e+02 1.080e+03 0.868 0.38557
## age 8.854e+01 5.837e+01 1.517 0.12954
## age2 2.264e+00 7.289e-01 3.106 0.00194 **
## smokeryes 1.343e+04 4.490e+02 29.905 < 2e-16 ***
## bmi30 9.667e+01 2.810e+02 0.344 0.73084
## smokeryes:bmi30 1.971e+04 6.175e+02 31.911 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4549 on 1331 degrees of freedom
## Multiple R-squared: 0.8595, Adjusted R-squared: 0.8589
## F-statistic: 1628 on 5 and 1331 DF, p-value: < 2.2e-16
summary(modele2)
##
## Call:
## lm(formula = charges ~ age + age2 + smoker * bmi30, data = insurance_encoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19736.1 -1873.9 -1395.5 -383.9 24013.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.374e+02 1.080e+03 0.868 0.38557
## age 8.854e+01 5.837e+01 1.517 0.12954
## age2 2.264e+00 7.289e-01 3.106 0.00194 **
## smokeryes 1.343e+04 4.490e+02 29.905 < 2e-16 ***
## bmi30 9.667e+01 2.810e+02 0.344 0.73084
## smokeryes:bmi30 1.971e+04 6.175e+02 31.911 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4549 on 1331 degrees of freedom
## Multiple R-squared: 0.8595, Adjusted R-squared: 0.8589
## F-statistic: 1628 on 5 and 1331 DF, p-value: < 2.2e-16
print(paste(" AIC du modèle 1: ", AIC(modele1)))
## [1] " AIC du modèle 1: 27104.5114872106"
print(paste(" AIC du modèle 2: ", AIC(modele2)))
## [1] " AIC du modèle 2: 26324.2409224332"
predicted_values1 <- predict(modele1)
residuals1 <- insurance_encoded$charges- predicted_values1
rmse1 <- sqrt(mean(residuals1^2))
print(paste("RMSE modele1:", rmse1))
## [1] "RMSE modele1: 6085.34491805181"
predicted_values2 <- predict(modele2)
residuals2 <- insurance_encoded$charges- predicted_values2
rmse2 <- sqrt(mean(residuals2^2))
print(paste("RMSE modele2:", rmse2))
## [1] "RMSE modele2: 4538.46344871283"
plot(modele2)
Le deuxième modèle obtenu après avoir effectué les 3 modifications est bien meilleur que le premier. Les variables introduites ont pu expliquer les charges de manière plus précise.
La régression ridge ajoute une pénalité égale au carré de la magnitude des coefficients à la fonction de coût. L’objectif est de minimiser la fonction de coût augmentée de cette pénalité. Cela a pour effet de “rétrécir” tous les coefficients, mais de ne jamais les rendre exactement égaux à zéro.
La fonction de coût pour la régression ridge est définie comme suit :
\[ J(\beta) = \sum_{i=1}^{n} (y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij})^2 + \lambda \sum_{j=1}^{p} \beta_j^2 \]
Ici, \(\lambda\) est le paramètre de pénalité qui contrôle l’ampleur de la pénalité. Plus \(\lambda\) est grand, plus la pénalité est forte.
La régression lasso ajoute une pénalité égale à la magnitude absolue des coefficients à la fonction de coût. Cela a tendance à pousser certains coefficients exactement à zéro, réalisant ainsi une sélection automatique des variables.
La fonction de coût pour la régression lasso est définie comme suit :
\[ J(\beta) = \sum_{i=1}^{n} (y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij})^2 + \lambda \sum_{j=1}^{p} |\beta_j| \]
Ici aussi, \(\lambda\) est le paramètre de pénalité.
# Préparer les données
X <- model.matrix(charges ~ age + age2 + smoker*bmi30, data = insurance_encoded)
y <- insurance_encoded$charges
# Ajuster le modèle lasso
modele_lasso <- cv.glmnet(X, y, alpha = 1)
coef(modele_lasso, s = "lambda.1se")
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3462.965797
## (Intercept) .
## age 62.359784
## age2 1.719159
## smokeryes 11844.470610
## bmi30 .
## smokeryes:bmi30 18066.612664
plot(modele_lasso)
lasso_coef <- coef(modele_lasso)
cat("\nLasso Coefficients:\n")
##
## Lasso Coefficients:
print(lasso_coef)
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3462.965797
## (Intercept) .
## age 62.359784
## age2 1.719159
## smokeryes 11844.470610
## bmi30 .
## smokeryes:bmi30 18066.612664
# Ajuster le modèle ridge
modele_ridge <- cv.glmnet(X, y, alpha = 0)
coef(modele_ridge, s = "lambda.1se")
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1459.062416
## (Intercept) .
## age 116.914244
## age2 1.502311
## smokeryes 12092.317773
## bmi30 683.813031
## smokeryes:bmi30 16486.333789
plot(modele_ridge)
ridge_coef <- coef(modele_lasso)
cat("Ridge Coefficients:\n")
## Ridge Coefficients:
print(ridge_coef)
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3462.965797
## (Intercept) .
## age 62.359784
## age2 1.719159
## smokeryes 11844.470610
## bmi30 .
## smokeryes:bmi30 18066.612664
df1<-insurance_encoded[, -4]
# Appliquer l'ACP
acp_result <- prcomp(df1, scale. = TRUE)
# Afficher un résumé des résultats
summary(acp_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4713 1.3550 1.1522 1.0901 1.0333 0.99173 0.95352
## Proportion of Variance 0.2165 0.1836 0.1328 0.1188 0.1068 0.09835 0.09092
## Cumulative Proportion 0.2165 0.4001 0.5328 0.6517 0.7584 0.85678 0.94770
## PC8 PC9 PC10
## Standard deviation 0.56588 0.43848 0.10252
## Proportion of Variance 0.03202 0.01923 0.00105
## Cumulative Proportion 0.97972 0.99895 1.00000
# Afficher les valeurs propres
print("Valeurs propres :")
## [1] "Valeurs propres :"
print(acp_result$sdev^2)
## [1] 2.16462300 1.83600877 1.32765665 1.18826198 1.06773626 0.98353046
## [7] 0.90919443 0.32021529 0.19226285 0.01051031
# Afficher les vecteurs propres
print("Vecteurs propres :")
## [1] "Vecteurs propres :"
print(acp_result$rotation)
## PC1 PC2 PC3 PC4
## age 0.507284532 -0.47951195 -0.03279611 0.1037684544
## bmi 0.468614252 0.41576313 0.03877372 -0.2894958199
## children 0.017997653 -0.02734176 0.03314272 -0.1652853971
## sex_male 0.024188301 0.08646041 -0.01111034 0.0009941042
## smoker_yes -0.001736939 0.08373541 -0.09737269 0.2788211926
## region_northwest -0.143900492 -0.23775063 -0.48524103 -0.5886186133
## region_southeast 0.211315921 0.36817948 -0.30497970 0.5422957850
## region_southwest -0.016113820 -0.07696609 0.80887911 -0.0609112637
## age2 0.509633040 -0.47571752 -0.03467896 0.1075799397
## bmi30 0.443625764 0.39706711 0.05251833 -0.3801612212
## PC5 PC6 PC7 PC8
## age 3.239397e-02 -0.007802989 -0.01236745 -0.005791427
## bmi -1.636335e-02 0.037735577 0.07101025 -0.050305599
## children 3.585613e-01 -0.916224411 -0.03264512 0.018213486
## sex_male 6.957371e-01 0.292615689 -0.64970268 -0.005190467
## smoker_yes 5.992403e-01 0.152345273 0.72349350 -0.001709083
## region_northwest 7.460899e-02 0.107427949 0.10316738 -0.558572301
## region_southeast -1.298079e-01 -0.172327304 -0.14891455 -0.587612793
## region_southwest 6.559743e-02 0.054272570 0.07448008 -0.565002609
## age2 1.606102e-02 0.035372132 -0.01100776 -0.005037383
## bmi30 7.340067e-05 0.069143193 0.09846302 0.143279727
## PC9 PC10
## age -0.0014981347 -0.7068155485
## bmi 0.7161871235 0.0064527001
## children -0.0038194999 0.0326404247
## sex_male -0.0004786298 -0.0005667143
## smoker_yes 0.0043073253 0.0005259329
## region_northwest -0.0328954337 -0.0008111475
## region_southeast -0.1366767049 -0.0006855716
## region_southwest -0.0616214534 0.0000395337
## age2 -0.0190587299 0.7065210590
## bmi30 -0.6805272361 -0.0114460272
# Calculer la variance expliquée
var_explained <- acp_result$sdev^2 / sum(acp_result$sdev^2)
# Calculer la variance expliquée cumulée
cum_var_explained <- cumsum(var_explained)
# Créer le graphique de variance expliquée
par(mfrow = c(1, 2))
plot(var_explained, type = "b", main = "Variance expliquée par composante", xlab = "Composante", ylab = "Variance expliquée")
plot(cum_var_explained, type = "b", main = "Variance expliquée cumulée", xlab = "Nombre de composantes", ylab = "Variance expliquée cumulée")
# Calculer la variance expliquée
var_explained <- acp_result$sdev^2 / sum(acp_result$sdev^2)
# Calculer la variance expliquée cumulée
cum_var_explained <- cumsum(var_explained)
# Déterminer l'échelle maximale
max_scale <- max(c(max(var_explained), max(cum_var_explained)))
# Créer un histogramme avec échelle spécifiée
par(mfrow = c(1, 1))
barplot(var_explained, names.arg = seq_along(var_explained), col = "skyblue", main = "Variance expliquée par composante", xlab = "Composante", ylab = "Variance expliquée", ylim = c(0, max_scale))
# Ajouter une ligne représentant la variance expliquée cumulée
lines(cum_var_explained, type = "b", pch = 16, col = "red", lty = 2)
# Sélectionner le nombre de composantes à inclure
nombre_composantes <- 8 # À ajuster en fonction du résumé de l'ACP
# Construire le modèle linéaire avec les composantes principales
modele3 <- lm(charges ~ acp_result$x[, 1:nombre_composantes], data = insurance_modif3_encoded)
# Afficher le résumé du modèle
summary(modele3)
##
## Call:
## lm(formula = charges ~ acp_result$x[, 1:nombre_composantes],
## data = insurance_modif3_encoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11828.7 -3327.6 -20.2 1387.4 29020.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13279.1 163.7 81.145 < 2e-16
## acp_result$x[, 1:nombre_composantes]PC1 2848.9 111.3 25.604 < 2e-16
## acp_result$x[, 1:nombre_composantes]PC2 -99.3 120.8 -0.822 0.4113
## acp_result$x[, 1:nombre_composantes]PC3 -1058.2 142.1 -7.448 1.70e-13
## acp_result$x[, 1:nombre_composantes]PC4 2073.0 150.2 13.803 < 2e-16
## acp_result$x[, 1:nombre_composantes]PC5 6036.7 158.4 38.103 < 2e-16
## acp_result$x[, 1:nombre_composantes]PC6 1054.0 165.1 6.385 2.37e-10
## acp_result$x[, 1:nombre_composantes]PC7 7169.7 171.7 41.760 < 2e-16
## acp_result$x[, 1:nombre_composantes]PC8 684.5 289.3 2.366 0.0181
##
## (Intercept) ***
## acp_result$x[, 1:nombre_composantes]PC1 ***
## acp_result$x[, 1:nombre_composantes]PC2
## acp_result$x[, 1:nombre_composantes]PC3 ***
## acp_result$x[, 1:nombre_composantes]PC4 ***
## acp_result$x[, 1:nombre_composantes]PC5 ***
## acp_result$x[, 1:nombre_composantes]PC6 ***
## acp_result$x[, 1:nombre_composantes]PC7 ***
## acp_result$x[, 1:nombre_composantes]PC8 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5984 on 1328 degrees of freedom
## Multiple R-squared: 0.7573, Adjusted R-squared: 0.7559
## F-statistic: 518 on 8 and 1328 DF, p-value: < 2.2e-16
# Prédictions du modèle
predictions_modele3 <- predict(modele3, newdata = insurance_modif3_encoded)
# Évaluation du modèle (exemple avec RMSE)
rmse_modele3 <- sqrt(mean((insurance_modif3_encoded$charges - predictions_modele3)^2))
# Afficher le RMSE
print(paste("RMSE du modele3 : ", rmse_modele3))
## [1] "RMSE du modele3 : 5963.59020276134"
# Afficher AIC
print(paste("AIC du modele3 : ", AIC(modele3)))
## [1] "AIC du modele3 : 27060.4680074105"
plot(modele3)
#### le meilleur modèle est le modèle 3 meilleur aic rmse et graphic
plots, ensuite modele 2 puis model 1.